This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# install.packages("ggplot2")
# install.packages("skimr")
# install.packages("plotly")
# install.packages("rsample")
# install.packages("patchwork")
library(ggplot2)
library(skimr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(rsample)
library(patchwork) # For combining plots
signals_df <- read.csv("./data.csv")
time_df <- read.csv("./time.csv", header = FALSE, skip = 1)
colnames(time_df)<-c("time")
signals_df <- cbind(time_df, signals_df)
head(signals_df, 5)
## time x1 x2 x3 x4 x5
## 1 0.002 -0.742 -0.777 -1.48 -2.37 -1.400
## 2 0.004 0.706 -3.120 1.60 -2.14 -4.990
## 3 0.006 -1.340 -1.550 -1.74 -1.96 -0.458
## 4 0.008 -2.080 -0.373 -3.08 -1.48 2.600
## 5 0.010 1.230 1.090 2.44 1.94 -1.580
tail(signals_df, 5)
## time x1 x2 x3 x4 x5
## 196 0.392 -0.478 -0.927 -0.855 -1.410 -2.24
## 197 0.394 1.920 2.710 1.120 1.780 3.22
## 198 0.396 0.262 -1.630 -0.466 -2.330 -2.30
## 199 0.398 1.360 0.919 0.471 -0.475 -1.41
## 200 0.400 -1.630 -1.020 -1.550 -1.190 -1.10
skim(signals_df)
| Name | signals_df |
| Number of rows | 200 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| time | 0 | 1 | 0.20 | 0.12 | 0.00 | 0.10 | 0.20 | 0.30 | 0.40 | ▇▇▇▇▇ |
| x1 | 0 | 1 | -0.06 | 1.48 | -3.38 | -1.11 | -0.06 | 0.73 | 5.34 | ▂▇▇▂▁ |
| x2 | 0 | 1 | -0.16 | 1.69 | -4.57 | -1.22 | -0.03 | 0.99 | 6.00 | ▂▆▇▂▁ |
| x3 | 0 | 1 | -0.01 | 1.58 | -3.76 | -1.11 | 0.08 | 1.00 | 5.42 | ▂▆▇▂▁ |
| x4 | 0 | 1 | -0.20 | 1.59 | -4.36 | -1.28 | -0.22 | 0.75 | 4.94 | ▁▆▇▃▁ |
| x5 | 0 | 1 | -0.30 | 1.87 | -6.44 | -1.41 | -0.24 | 0.73 | 5.76 | ▁▃▇▃▁ |
plot_ly(signals_df, x = ~time, y = ~x1, type = 'scatter', mode = 'lines',
name = 'x1') %>%
add_trace(y = ~x3, name = 'x3') %>%
add_trace(y = ~x4, name = 'x4') %>%
add_trace(y = ~x5, name = 'x5') %>%
layout(title = "Time Series of Input Signals(X)",
xaxis = list(title = "Time (Seconds)"),
yaxis = list(title = "Signal Value"))
x1_time <- plot_ly(signals_df, x = ~time, y = ~x1, type = 'scatter', mode = 'lines', name = 'X1') %>%
layout(title = "Time Series of x1",
xaxis = list(title = "Time (Seconds)",gridcolor = '#ffff'),
yaxis = list(title = "X1", gridcolor = '#ffff', range = list(-4, 4)) )
x3_time <- plot_ly(signals_df, x = ~time, y = ~x3, type = 'scatter', mode = 'lines', name = 'X3') %>%
layout(title = "Time Series of x3",
xaxis = list(title = "Time (Seconds)",gridcolor = '#ffff'),
yaxis = list(title = "X3", gridcolor = '#ffff', range = list(-5, 5)) )
x4_time <- plot_ly(signals_df, x = ~time, y = ~x4, type = 'scatter', mode = 'lines', name = 'X4') %>%
layout(title = "Time Series of x4",
xaxis = list(title = "Time (Seconds)",gridcolor = '#ffff'),
yaxis = list(title = "X4", gridcolor = '#ffff', range = list(-5, 5)) )
x5_time <- plot_ly(signals_df, x = ~time, y = ~x5, type = 'scatter', mode = 'lines', name = 'X5') %>%
layout(title = "Time Series of x5",
xaxis = list(title = "Time (Seconds)",gridcolor = '#ffff'),
yaxis = list(title = "X5", gridcolor = '#ffff', range = list(-5, 5)) )
x1_time
x3_time
x4_time
x5_time
# Plot all using subplots function of plotly
plotly::subplot(x1_time, x3_time, x4_time, x5_time, nrows = 4, shareX = TRUE, titleX =TRUE, titleY = TRUE ) %>%
layout(plot_bgcolor='#F8F4EC', title="Time series analysis of input signals (X1,X3,X4,X5) with time")
x2_time <- plot_ly(signals_df, x = ~time, y = ~x2, type = 'scatter', mode = 'lines', name = 'x2') %>%
layout(title = "Time Series of output signal (X2) with time",
xaxis = list(title = "Time (Seconds)",zerolinewidth = 2,gridcolor = '#ffff'),
yaxis = list(title = "X2 (Output Signal)",zerolinewidth = 2,gridcolor = '#ffff'),
plot_bgcolor="#BCF2F6", paper_bgcolor="#fff", colorway="#334756"
)
x2_time
x1_distribution = ggplot(signals_df, aes(x = x1)) +
geom_histogram(aes(y = after_stat(density) ), bins=10,fill = "#CC2B52") +
stat_density(geom="line", color="#424242", linewidth=.8) +
geom_rug() +
labs(x="X1", y="Density")
x3_distribution = ggplot(signals_df, aes(x = x3)) +
geom_histogram(aes(y = after_stat(density) ), bins=10,fill = "#88C273") +
stat_density(geom="line", color="#424242", linewidth=.8) +
geom_rug() +
labs(x="X3", y="Density")
x4_distribution = ggplot(signals_df, aes(x = x4)) +
geom_histogram(aes(y = after_stat(density) ), bins=10,fill = "#37AFE1") +
stat_density(geom="line", color="#424242", linewidth=.8) +
geom_rug() +
labs(x="X4", y="Density")
x5_distribution = ggplot(signals_df, aes(x = x5)) +
geom_histogram(aes(y = after_stat(density) ), bins=10,fill = "#FF77B7") +
stat_density(geom="line", color="#424242", linewidth=.8) +
geom_rug() +
labs(x="X5", y="Density")
plotly::subplot(x1_distribution, x3_distribution,x4_distribution, x5_distribution, nrows=2, shareX = FALSE,
titleY = TRUE, titleX = TRUE) %>% layout(plot_bgcolor='#F8F4EC', title="Distribution of X1, X3, X4, and X5")
x2_distribution = ggplot(signals_df, aes(x = x2)) +
geom_histogram(aes(y = after_stat(density) ), bins=10,fill = "#5A72A0") +
stat_density(geom="line", color="#1A2130", linewidth=.8) +
geom_rug()
ggplotly(x2_distribution) %>% layout(title="Distribution of Output(X2) Signal",
xaxis = list(title="X2 Signal (Output)"),
yaxis = list(title="Density"))
distribution_df = data.frame(rbind(data.frame(values = signals_df$x1, Inputs = "X1"),
data.frame(values= signals_df$x3, Inputs = "X3"),
data.frame(values= signals_df$x4, Inputs = "X4"),
data.frame(values =signals_df$x5, Inputs = "X5")))
distribution_plot <- ggplot(distribution_df, aes(x = values)) +
geom_histogram(aes(x=values, y = after_stat(density), fill=Inputs), bins = 10, alpha=0.5)+
stat_density(aes(x=values, y = after_stat(density), color=Inputs),geom="line", size=.8) +
geom_rug()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
distribution_plotly <- ggplotly(distribution_plot) %>% layout(plot_bgcolor='#FFF0DC',
title="Distribution of Input (X) Signals ",
xaxis= list(title="Input (X) Signal"),
yaxis = list(title="Density"))
distribution_plotly
x1_correlation = plot_ly(signals_df, x=~x1, y=~x2, type="scatter", mode="markers") %>%
layout(plot_bgcolor='#F8F4EC', title="Corelation of X1 input with X2 output",
xaxis = list(title="X1 Signal (Input)"), yaxis = list(title="X2 Signal (Output)"))
x3_correlation = plot_ly(signals_df, x=~x3, y=~x2, type="scatter", mode="markers") %>%
layout(plot_bgcolor='#F8F4EC', title="Corelation of X3 input to X2 output",
xaxis = list(title="X3 Signal (Input)"), yaxis = list(title="X2 Signal (Output)"))
x4_correlation = plot_ly(signals_df, x=~x4, y=~x2, type="scatter", mode="markers") %>%
layout(plot_bgcolor='#F8F4EC', title="Corelation of X3 input to y output",
xaxis = list(title="X4 Signal (Input)"), yaxis = list(title="X2 Signal (Output)"))
x5_correlation = plot_ly(signals_df, x=~x5, y=~x2, type="scatter", mode="markers") %>%
layout(plot_bgcolor='#F8F4EC', title="Corelation of X5 input to X2 output",
xaxis = list(title="X5 Signal (Input)"), yaxis = list(title="X2 Signal (Output)"))
plotly::subplot(x1_correlation, x3_correlation,x4_correlation, x5_correlation, nrows=2,
shareX = FALSE,
titleY = TRUE, titleX = TRUE) %>% layout(plot_bgcolor='#F8F4EC',
title="Correlation of Input Signals (X1,X3,X4,X5) with output signal (X2)")
plot_ly(signals_df) %>%
add_trace( x=~x1, y=~x2,type = "scatter",mode = "markers", name = "X1") %>%
add_trace( x=~x3, y=~x2,type = "scatter",mode = "markers", name = "X3") %>%
add_trace( x=~x4, y=~x2, type = "scatter",mode = "markers", name = "X4") %>%
add_trace( x=~x5, y=~x2,type = "scatter",mode = "markers", name = "X5") %>%
layout(plot_bgcolor='#F8F4EC', title="Corelation of X Signals(Input) with X2 Signal (Output).",
xaxis = list(title="X1,X3,X4,X5 Signals (Input)"), yaxis = list(title="X2 Signal (Output)"))
# Generate models
# Model 1: x2 = θ1x4 +θ2x3^2 + θbias
generateModel1 <- function(df){
onesMatrix <- matrix(1, length(df$x2), 1)
return (cbind(df$x4, df$x3^2, onesMatrix)) # Design matrix
}
# Model 2: x2 = θ1x4 +θ2x3^2 + θ3x5 + θbias
generateModel2 <- function(df){
onesMatrix <- matrix(1, length(df$x2), 1)
return (cbind(df$x4, df$x3^2, df$x5, onesMatrix)) # Design matrix
}
# Model 3: x2 = θ1x3 +θ2x4 + θ3x5^3
generateModel3 <- function(df){
return (cbind(df$x3, df$x4, df$x5^3)) # Design matrix
}
# Model 4: x2 = θ1x4 +θ2x3^2 + θ3x5^3 + θbias
generateModel4 <- function(df){
onesMatrix <- matrix(1, length(df$x2), 1)
return (cbind(df$x4, df$x3^2, df$x5^3, onesMatrix)) # Design matrix
}
# Model 5: x2 = θ1x4 +θ2x1^2 + θ3x3^2 + θbias
generateModel5 <- function(df){
onesMatrix <- matrix(1, length(df$x2), 1)
return (cbind(df$x4, df$x1^2, df$x3^2, onesMatrix)) # Design matrix
}
thetaHat <- function(model, y){
return (solve(t(model) %*% model) %*% t(model) %*% y)
}
model1 = generateModel1(signals_df)
model1_thetaHat = thetaHat(model1, signals_df$x2)
model1_yHat = model1 %*% model1_thetaHat
print("Model 1 θhat")
## [1] "Model 1 θhat"
print(model1_thetaHat[,1])
## [1] 0.88481155 0.05485661 -0.12167769
print("Model 1 Yhat")
## [1] "Model 1 Yhat"
print(model1_yHat[1:5,])
## [1] -2.098523 -1.874741 -1.689824 -0.910807 1.921451
model2 = generateModel2(signals_df)
model2_thetaHat = thetaHat(model2, signals_df$x2)
model2_yHat = model2 %*% model2_thetaHat
print("Model 2 θhat")
## [1] "Model 2 θhat"
print(model2_thetaHat[,1])
## [1] 0.76073534 0.03927117 0.16073311 -0.05929464
print("Model 2 Yhat")
## [1] "Model 2 Yhat"
print(model2_yHat[1:5,])
## [1] -2.0012442 -2.3887923 -1.5050543 -0.3947349 1.3963784
model3 = generateModel3(signals_df)
model3_thetaHat = thetaHat(model3, signals_df$x2)
model3_yHat = model3 %*% model3_thetaHat
print("Model 3 θhat")
## [1] "Model 3 θhat"
print(model3_thetaHat[,1])
## [1] -0.110359479 0.921205946 0.005305333
print("Model 3 Yhat")
## [1] "Model 3 Yhat"
print(model3_yHat[1:5,])
## [1] -2.0344839 -2.8071515 -1.6140479 -0.9302311 1.4969365
model4 = generateModel4(signals_df)
model4_thetaHat = thetaHat(model4, signals_df$x2)
model4_yHat = model4 %*% model4_thetaHat
print("Model 4 θhat")
## [1] "Model 4 θhat"
print(model4_thetaHat[,1])
## [1] 0.835911791 0.040640457 0.004980744 -0.082086650
print("Model 4 Yhat")
## [1] "Model 4 Yhat"
print(model4_yHat[1:5,])
## [1] -1.9878459 -2.3857632 -1.5979092 -0.8461629 1.7618936
model5 = generateModel5(signals_df)
model5_thetaHat = thetaHat(model5, signals_df$x2)
model5_yHat = model5 %*% model5_thetaHat
print("Model 5 θhat")
## [1] "Model 5 θhat"
print(model5_thetaHat[,1])
## [1] 0.88614620 -0.03576666 0.07754541 -0.09919425
print("Model 5 Yhat")
## [1] "Model 5 Yhat"
print(model5_yHat[1:5,])
## [1] -2.0491971 -1.8148583 -1.6654869 -0.8298047 2.0274923
RSS = Σ(y_actual - y_predicted)^2
calculateRSS <- function(y, y_hat_model){
return (sum((y-y_hat_model)^2))
}
# Compute RSS for each candidate models
model1_RSS = calculateRSS(signals_df$x2, model1_yHat)
model2_RSS = calculateRSS(signals_df$x2, model2_yHat)
model3_RSS = calculateRSS(signals_df$x2, model3_yHat)
model4_RSS = calculateRSS(signals_df$x2, model4_yHat)
model5_RSS = calculateRSS(signals_df$x2, model5_yHat)
# Print the models and their RSS in a table
rsss = c(model1_RSS, model2_RSS, model3_RSS, model4_RSS, model5_RSS)
data.frame(Model=c("Model 1","Model 2", "Model 3", "Model 4","Model 5"),
RSS=rsss)
## Model RSS
## 1 Model 1 151.9524
## 2 Model 2 142.9336
## 3 Model 3 149.6405
## 4 Model 4 148.4341
## 5 Model 5 150.2694
calculateVariance <- function(N, RSS){
return (RSS/(N-1))
}
calculateLogLikelihood <- function(N, variance, RSS) {
return (-(N/2)*(log(2*pi))-(N/2)*(log(variance))-(1/(2*variance))*RSS)
}
# Total Number of rows/sample data
N = length(signals_df$x2)
# Calculate variances for each model
model1_variance = calculateVariance(N, model1_RSS)
model2_variance = calculateVariance(N, model2_RSS)
model3_variance = calculateVariance(N, model3_RSS)
model4_variance = calculateVariance(N, model4_RSS)
model5_variance = calculateVariance(N, model5_RSS)
# Calculate log-likelihood for each model
model1_likelihood = calculateLogLikelihood(N, model1_variance, model1_RSS)
model2_likelihood = calculateLogLikelihood(N, model2_variance, model2_RSS)
model3_likelihood = calculateLogLikelihood(N, model3_variance, model3_RSS)
model4_likelihood = calculateLogLikelihood(N, model4_variance, model4_RSS)
model5_likelihood = calculateLogLikelihood(N, model5_variance, model5_RSS)
# Display the obtained values in a table
variances <- c(model1_variance, model2_variance, model3_variance, model4_variance, model5_variance)
likelihoods <- c(model1_likelihood, model2_likelihood, model3_likelihood, model4_likelihood, model5_likelihood)
data.frame(Model=c("Model 1","Model 2", "Model 3", "Model 4","Model 5"),
Variance=variances,
Likelihood=likelihoods)
## Model Variance Likelihood
## 1 Model 1 0.7635800 -256.3140
## 2 Model 2 0.7182592 -250.1952
## 3 Model 3 0.7519622 -254.7808
## 4 Model 4 0.7459002 -253.9714
## 5 Model 5 0.7551225 -255.2002
# Calculate Akaike Information Criterion (AIC)
calculateAIC <- function(N, thetahat, likelihood){
k = length(thetahat)
return (2 * k-2 * likelihood)
}
model1_AIC = calculateAIC(N, model1_thetaHat, model1_likelihood)
model2_AIC = calculateAIC(N, model2_thetaHat, model2_likelihood)
model3_AIC = calculateAIC(N, model3_thetaHat, model3_likelihood)
model4_AIC = calculateAIC(N, model4_thetaHat, model4_likelihood)
model5_AIC = calculateAIC(N, model5_thetaHat, model5_likelihood)
modelLabels = c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5")
aics = c(model1_AIC, model2_AIC, model3_AIC, model4_AIC, model5_AIC)
data.frame(Models = modelLabels, AIC = aics)
## Models AIC
## 1 Model 1 518.6279
## 2 Model 2 508.3905
## 3 Model 3 515.5616
## 4 Model 4 515.9427
## 5 Model 5 518.4004
calculateBIC <- function(N, thetahat, likelihood){
k = length(thetahat)
return (k* log(N) - 2*likelihood)
}
model1_BIC = calculateBIC(N, model1_thetaHat, model1_likelihood)
model2_BIC = calculateBIC(N, model2_thetaHat, model2_likelihood)
model3_BIC = calculateBIC(N, model3_thetaHat, model3_likelihood)
model4_BIC = calculateBIC(N, model4_thetaHat, model4_likelihood)
model5_BIC = calculateBIC(N, model5_thetaHat, model5_likelihood)
bics = c(model1_BIC, model2_BIC, model3_BIC, model4_BIC, model5_BIC)
data.frame(Models = modelLabels, BIC = bics)
## Models BIC
## 1 Model 1 528.5229
## 2 Model 2 521.5837
## 3 Model 3 525.4565
## 4 Model 4 529.1360
## 5 Model 5 531.5936
calculateError <- function(y, yHat){
return (y - yHat)
}
plotQQ <-function(modelError){
error_fig = ggplot(data.frame(modelError), aes(sample=modelError)) +
geom_qq(color = "#0A5EB0" ) +
geom_qq_line(color = "#FF4545" )
return (
ggplotly(error_fig) %>%
layout( plot_bgcolor='#F8F4EC',
xaxis = list(title=list(text="Theoritical Quantities",font=list(size=10) )),
yaxis = list(title=list(text="Sample Quantiles",font=list(size=10)))
)
)
}
model1_error = calculateError(signals_df$x2, model1_yHat)
qq1 <- plotQQ(model1_error)
model2_error = calculateError(signals_df$x2, model2_yHat)
qq2 <- plotQQ(model2_error)
model3_error = calculateError(signals_df$x2, model3_yHat)
qq3 <- plotQQ(model3_error)
model4_error = calculateError(signals_df$x2, model4_yHat)
qq4 <- plotQQ(model4_error)
model5_error = calculateError(signals_df$x2, model5_yHat)
qq5 <- plotQQ(model5_error)
plotly::subplot(qq1, qq2, qq3, qq4,qq5, nrows = 3, shareX = FALSE, titleX =TRUE, titleY = TRUE ) %>%
layout( plot_bgcolor = '#F8F4EC',
annotations = list(
list(x = 0.0, y = 1.0, text = "QQ plot of model 1", showarrow = FALSE, xref = "paper", yref = "paper"),
list(x = 0.57, y = 1.0, text = "QQ plot of model 2", showarrow = FALSE, xref = "paper", yref = "paper"),
list(x = 0.0, y = 0.62, text = "QQ plot of model 3", showarrow = FALSE, xref = "paper", yref = "paper"),
list(x = 0.57, y = 0.62, text = "QQ plot of model 4", showarrow = FALSE, xref = "paper", yref = "paper"),
list(x = 0.0, y = 0.28, text = "QQ plot of model 5", showarrow = FALSE, xref = "paper", yref = "paper")
)
)
data.frame(Model = modelLabels, RSS = rsss, AIC = aics, BIC = bics, Likelihood = likelihoods)
## Model RSS AIC BIC Likelihood
## 1 Model 1 151.9524 518.6279 528.5229 -256.3140
## 2 Model 2 142.9336 508.3905 521.5837 -250.1952
## 3 Model 3 149.6405 515.5616 525.4565 -254.7808
## 4 Model 4 148.4341 515.9427 529.1360 -253.9714
## 5 Model 5 150.2694 518.4004 531.5936 -255.2002
# Generate training and testing data
set.seed(123)
split_data = initial_split(signals_df, prop = .7)
training_data = training(split_data)
testing_data = testing(split_data)
training_model = generateModel2(training_data)
testing_model = generateModel2(testing_data)
training_thetaHat <- thetaHat(training_model, training_data$x2)
training_yHat = training_model %*% training_thetaHat
testing_yHat = testing_model %*% training_thetaHat
ttest = t.test(training_yHat, testing_yHat, mu = 100, alternative = "two.sided", conf.level=0.95)
ttest
##
## Welch Two Sample t-test
##
## data: training_yHat and testing_yHat
## t = -461.93, df = 113.77, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 100
## 95 percent confidence interval:
## 0.008496225 0.862477269
## sample estimates:
## mean of x mean of y
## 0.02454843 -0.41093832
confidence_interval1 = ttest$conf.int[1] # Lower bound of 95% CI
confidence_interval12 = ttest$conf.int[2] # Upper bound of 95% CI
std_error = ttest$stderr # Standard error of the mean difference
c_i = c(confidence_interval1,confidence_interval12)
print("Confidence Interval")
## [1] "Confidence Interval"
c_i
## [1] 0.008496225 0.862477269
print("Standard Error")
## [1] "Standard Error"
std_error
## [1] 0.215539
# Plot the density distribution of training and testing data results
training_plot = ggplot(training_data, aes(x = x2)) +
stat_density(geom="line", color = "#0A5EB0") +
geom_vline(xintercept = confidence_interval1, linetype="dashed", color="#FF4545")+
geom_vline(xintercept = confidence_interval12, linetype="dashed", color="#FF4545")+
geom_vline(xintercept = std_error, linetype="dashed", color="#424242") +
labs(x="X2", y="Density")
testing_plot = ggplot(testing_data, aes(x = x2)) +
stat_density(geom="line", color = "#0A5EB0") +
geom_vline(xintercept = confidence_interval1, linetype="dashed", color="#FF4545")+
geom_vline(xintercept = confidence_interval12, linetype="dashed", color="#FF4545")+
geom_vline(xintercept = std_error, linetype="dashed", color="#424242") +
labs(x="X2", y="Density")
subplot(training_plot, testing_plot, nrows=1, shareX = FALSE,
titleY = TRUE, titleX = TRUE) %>%
layout(plot_bgcolor='#F8F4EC', title="Distributions of Training and Testing Data",
annotations = list(
list(x = 0.25, y = -0.05, text = "Training", showarrow = FALSE, xref = "paper", yref = "paper"),
list(x = 0.75, y = -0.05, text = "Testing", showarrow = FALSE, xref = "paper", yref = "paper")
)
)
# Select the top two largest absolute parameters from Model 2
abs_theta <- abs(model2_thetaHat)
selected_ids <- order(abs_theta, decreasing = TRUE)[1:2]
theta1_id <- selected_ids[1]
theta2_id <- selected_ids[2]
# Sample size to perform the ABC rejection
sample_size <- 1500
# Generate a series of uniform parameters for the selected coefficients centered around
# the estimated parameters with a range of +/- 0.5.
theta1_prior <- model2_thetaHat[theta1_id] + runif(sample_size, -0.5, 0.5)
theta2_prior <- model2_thetaHat[theta2_id] + runif(sample_size, -0.5, 0.5)
prior_df <- data.frame(theta1_prior=theta1_prior, theta2_prior=theta2_prior)
skim(prior_df)
| Name | prior_df |
| Number of rows | 1500 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| theta1_prior | 0 | 1 | 0.75 | 0.29 | 0.26 | 0.51 | 0.75 | 1.01 | 1.26 | ▇▇▇▇▇ |
| theta2_prior | 0 | 1 | 0.16 | 0.29 | -0.34 | -0.10 | 0.16 | 0.41 | 0.66 | ▇▇▇▇▇ |
head(prior_df,10)
## theta1_prior theta2_prior
## 1 0.7757651 0.59664042
## 2 0.6633087 0.50602678
## 3 1.1409819 0.37691983
## 4 0.6248272 -0.21546054
## 5 0.5489746 0.25556057
## 6 0.4313806 0.65058591
## 7 0.4329071 0.27614160
## 8 0.7427779 -0.06618922
## 9 0.5137003 -0.27556004
## 10 0.4769901 -0.26696361
# Empty vectors to store the accepted parameters
theta1_accepted <- c()
theta2_accepted <- c()
# Threshold epsilon which is the minimum value for the RSS to be accepted, with
# k as 1.5, meaning the abc RSS can only be 1.5 times greater than the actual RSS of model2
epsilon_threshold <- 1.5 * sum((signals_df$x2 - model2_yHat)^2)
# The count of the accepted parameters or posterior
counter <- 0
for (i in 1:sample_size) {
# Construct a new parameter matrix containing the sampled parameters for the prior parameters,
# and fixed initial parameters from parameter estimates of model 2
parameter_matrix <- model2_thetaHat
parameter_matrix[theta1_id] <- theta1_prior[i]
parameter_matrix[theta2_id] <- theta2_prior[i]
# Compute outputs using the parameters from prior distribution
abc_yHat <- model2 %*% parameter_matrix
# Compute RSS
abc_RSS <- sum((signals_df$x2 - abc_yHat)^2)
# Accept/reject the computed yHat based on whether it is less than the threshold value or epsilon
if (abc_RSS < epsilon_threshold) {
# store the accepted prior parameters to the posterior parameter
theta1_accepted <- c(theta1_accepted, theta1_prior[i])
theta2_accepted <- c(theta2_accepted, theta2_prior[i])
counter <- counter + 1 # increase the posterior counter
}
}
# Store the accepted parameters by combining them into a dataframe
posterior_df <- data.frame(theta1 = theta1_accepted, theta2 = theta2_accepted)
skim(posterior_df)
| Name | posterior_df |
| Number of rows | 772 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| theta1 | 0 | 1 | 0.75 | 0.26 | 0.26 | 0.54 | 0.74 | 0.96 | 1.26 | ▅▇▇▇▅ |
| theta2 | 0 | 1 | 0.17 | 0.22 | -0.27 | -0.01 | 0.16 | 0.34 | 0.59 | ▅▇▇▇▅ |
head(posterior_df, 10)
## theta1 theta2
## 1 0.6633087 0.50602678
## 2 0.5489746 0.25556057
## 3 0.4329071 0.27614160
## 4 0.7427779 -0.06618922
## 5 0.6126240 0.21905205
## 6 0.6696793 0.17339426
## 7 1.1795927 -0.22946529
## 8 0.5432637 0.47964066
## 9 0.9891298 0.21737416
## 10 0.9471104 -0.02539570
# Joint Posterior Distribution (Scatter Plot)
joint_plot <- ggplot(posterior_df, aes(x = theta1, y = theta2)) +
geom_point(alpha = 0.5, color = "#7E5CAD") +
labs(title = "Joint Posterior Distribution", x = "Theta1", y = "Theta2") +
theme_minimal()
joint_plot
# Marginal Posterior for Theta1
marginal_theta1 <- ggplot(posterior_df, aes(x = theta1)) +
geom_histogram(bins = 30, fill = "#5A72A0", alpha = 0.7) +
labs(title = "Marginal Posterior for Theta1", x = "Theta1", y = "Density") +
theme_minimal()
# Marginal Posterior for Theta2
marginal_theta2 <- ggplot(posterior_df, aes(x = theta2)) +
geom_histogram(bins = 30, fill = "#FF4545", alpha = 0.7) +
labs(title = "Marginal Posterior for Theta2", x = "Theta2", y = "Density") +
theme_minimal()
# Combine Plots
combined_plot <- (marginal_theta1 + marginal_theta2)
combined_plot <- combined_plot + plot_annotation(title = "ABC Posterior and Data Distributions")
combined_plot
# Distribution of Data (X2)
data_distribution <- ggplot(signals_df, aes(x = x2)) +
geom_histogram(aes(y = after_stat(density)), bins = 20, fill = "#5A72A0", alpha = 0.8) +
geom_density(color = "#FF4545", linetype = "dashed", lwd=1.5) +
labs(title = "Distribution of X2 (Output Data)", x = "X2", y = "Density") +
theme_minimal()
data_distribution